Here, we explore the TH2 dataset to try to understand whether it is only the Fluidigm data that is somewhat off.
The tricky part of this analysis is that we don’t know the ground truth for this dataset, so it is not easy to understand which representation is better. However, we do not see the issues with W observed in the Fluidigm data, suggesting that the problem is specific to that dataset.
As a first pass, I will only focus on the cells that passed the QC filters (by the original authors) and that were not doublets.
Select cells, remove ERCC spike-ins, and keep only the genes that have more than 10 counts in at least 10 cells.
# Load the TH2 dataset; keep only cells flagged as true single cells that
# also passed the original authors' (Buettner) QC filter, and drop the
# ERCC spike-in rows so only endogenous genes remain.
data("th2")
th2_core <- th2[grep("^ERCC-", rownames(th2), invert = TRUE),
which(colData(th2)$single=="OK" & colData(th2)$Buettner_filter=="PASS")]
# Gene filter: TRUE for genes with more than 10 counts in at least 10
# cells. rowSums() on the logical matrix replaces the slower
# apply(X, 1, sum) idiom and returns the identical vector.
# NOTE(review): the surrounding text says "at least 10 counts" but the
# code uses strictly > 10 -- confirm which threshold is intended.
filter <- rowSums(assay(th2_core) > 10) >= 10
Number of retained genes:
# Report how many genes survive the count filter (TRUE entries).
print(sum(filter))
## [1] 9365
To speed up the computations, I will focus on the top 1,000 most variable genes.
# Restrict to the genes passing the count filter, then keep the 1,000
# genes with the highest variance of log1p(counts).
th2_core <- th2_core[filter,]
core <- assay(th2_core)
gene_vars <- rowVars(log1p(core))
names(gene_vars) <- rownames(core)
top_genes <- names(sort(gene_vars, decreasing = TRUE))[1:1000]
core <- core[top_genes,]
First, let’s look at PCA (of the log counts) for reference.
# Per-cell QC covariates: number of detected genes and total counts.
detection_rate <- colSums(core > 0)
coverage <- colSums(core)
# Gata3 marker expression, used to color all plots below.
# NOTE(review): log2 of raw counts yields -Inf for zero counts; those
# cells render at the lowest color -- log1p may be intended. Confirm.
gata3 <- log2(assay(th2_core)["Gata3",])
# PCA of the log counts as a baseline embedding.
pca <- prcomp(t(log1p(core)))
pca_df <- data.frame(pca$x)
ggplot(pca_df, aes(PC1, PC2)) +
  geom_point(aes(color = gata3), size = 3) +
  scale_colour_gradient(low = "blue", high = "green") +
  theme_classic()
# How do the first two PCs relate to the QC covariates?
df <- data.frame(PC1 = pca$x[, 1], PC2 = pca$x[, 2],
                 detection_rate = detection_rate, coverage = coverage)
pairs(df, pch = 19)
print(cor(df, method = "spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.000000000 -0.002629017 0.95570040 0.5829114
## PC2 -0.002629017 1.000000000 -0.04921096 -0.2084226
## detection_rate 0.955700402 -0.049210957 1.00000000 0.4791100
## coverage 0.582911392 -0.208422590 0.47910999 1.0000000
# Fit ZINB-WaVE with K = 2 latent factors and the default V (gene-level
# intercept); record and print the elapsed time.
fit_time <- system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2))
print(fit_time)
## user system elapsed
## 103.427 22.829 47.788
Plot the results with cells colored according to Gata3, mimicking the figure from the original paper.
data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()
One interpretation of the model is that \(\gamma_\mu\) will act as a “size factor”. Here, we check whether it captures sequencing depth and/or detection rate.
# Relate the latent factors and the cell-level intercepts (gamma_mu,
# gamma_pi) to detection rate (number of detected genes per cell) and
# coverage (total counts per cell).
df <- data.frame(W1 = zinb_Vall@W[, 1], W2 = zinb_Vall@W[, 2],
                 gamma_mu = zinb_Vall@gamma_mu[1, ],
                 gamma_pi = zinb_Vall@gamma_pi[1, ],
                 detection_rate = detection_rate, coverage = coverage)
pairs(df, pch = 19)
print(cor(df, method = "spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.00000000 0.020886076 -0.48763389 0.14491237
## W2 0.02088608 1.000000000 0.05333496 -0.06962025
## gamma_mu -0.48763389 0.053334956 1.00000000 -0.10932327
## gamma_pi 0.14491237 -0.069620253 -0.10932327 1.00000000
## detection_rate -0.48564630 -0.004187131 0.69492980 -0.44874112
## coverage -0.17653359 0.118719572 0.83259494 -0.28188900
## detection_rate coverage
## W1 -0.485646297 -0.1765336
## W2 -0.004187131 0.1187196
## gamma_mu 0.694929799 0.8325949
## gamma_pi -0.448741123 -0.2818890
## detection_rate 1.000000000 0.4791100
## coverage 0.479109992 1.0000000
\(\gamma_\pi\) is clearly (negatively) associated with the detection rate and \(\gamma_\mu\) clearly captures the coverage. Interestingly, the two estimates are only weakly (negatively) correlated with each other (Spearman −0.11).
# Per-cell averages of the fitted dropout probability (Pi) vs detection
# rate, and of log(1 + Mu) vs coverage, side by side.
par(mfrow = c(1, 2))
plot(rowMeans(getPi(zinb_Vall)), detection_rate,
     pch = 19, xlab = "Average estimated Pi", ylab = "detection rate")
plot(rowMeans(log1p(getMu(zinb_Vall))), coverage,
     pch = 19, xlab = "Average estimated log Mu", ylab = "coverage")
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
# Gene-level estimates of the default fit: intercepts (beta) and the two
# factor loadings (alpha) for both the mean (mu) and dropout (pi) parts.
par(mfrow = c(2, 2))
boxplot(zinb_Vall@beta_mu[1,], main = "Beta_Mu")
boxplot(zinb_Vall@beta_pi[1,], main = "Beta_Pi")
plot(t(zinb_Vall@alpha_mu), main = "Alpha_Mu",
     xlab = "Alpha_Mu 1", ylab = "Alpha_Mu 2")
plot(t(zinb_Vall@alpha_pi), main = "Alpha_Pi",
     xlab = "Alpha_Pi 1", ylab = "Alpha_Pi 2")
# Refit with V set to a single all-zero column: no gene-level intercept
# in either component.
zero_V <- matrix(0, ncol = 1, nrow = NROW(core))
fit_time <- system.time(
  zinb_Vnone <- zinbFit(core, ncores = 3, K = 2, V = zero_V)
)
print(fit_time)
## user system elapsed
## 104.388 21.766 47.100
data.frame(W1=zinb_Vnone@W[,1], W2=zinb_Vnone@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()
# Without gamma intercepts, check whether the QC covariates leak into W.
df <- data.frame(W1 = zinb_Vnone@W[, 1], W2 = zinb_Vnone@W[, 2],
                 detection_rate = detection_rate, coverage = coverage)
pairs(df, pch = 19)
print(cor(df, method = "spearman"))
## W1 W2 detection_rate coverage
## W1 1.0000000 0.1125365 -0.4053970 -0.1227605
## W2 0.1125365 1.0000000 -0.6382453 -0.8917235
## detection_rate -0.4053970 -0.6382453 1.0000000 0.4791100
## coverage -0.1227605 -0.8917235 0.4791100 1.0000000
# Same diagnostics for the no-intercept fit: fitted Pi vs detection
# rate, fitted log Mu vs coverage.
par(mfrow = c(1, 2))
plot(rowMeans(getPi(zinb_Vnone)), detection_rate,
     pch = 19, xlab = "Average estimated Pi", ylab = "detection rate")
plot(rowMeans(log1p(getMu(zinb_Vnone))), coverage,
     pch = 19, xlab = "Average estimated log Mu", ylab = "coverage")
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
# Gene-level estimates of the no-intercept fit: beta and alpha for both
# components.
par(mfrow = c(2, 2))
boxplot(zinb_Vnone@beta_mu[1,], main = "Beta_Mu")
boxplot(zinb_Vnone@beta_pi[1,], main = "Beta_Pi")
plot(t(zinb_Vnone@alpha_mu), main = "Alpha_Mu",
     xlab = "Alpha_Mu 1", ylab = "Alpha_Mu 2")
plot(t(zinb_Vnone@alpha_pi), main = "Alpha_Pi",
     xlab = "Alpha_Pi 1", ylab = "Alpha_Pi 2")
# Two-column V: column 1 all zeros, column 2 all ones. Pointing mu at
# column 2 and pi at column 1 keeps the gene-level intercept in the mean
# model only.
V <- matrix(rep(c(0, 1), each = NROW(core)), ncol = 2)
fit_time <- system.time(
  zinb_Vmu <- zinbFit(core, ncores = 3, K = 2, V = V,
                      which_V_mu = 2L, which_V_pi = 1L)
)
print(fit_time)
## user system elapsed
## 128.814 26.974 58.236
data.frame(W1=zinb_Vmu@W[,1], W2=zinb_Vmu@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()
# With an intercept in mu only, gamma_mu is the sole cell-level offset;
# relate it and W to the QC covariates.
df <- data.frame(W1 = zinb_Vmu@W[, 1], W2 = zinb_Vmu@W[, 2],
                 gamma_mu = zinb_Vmu@gamma_mu[1, ],
                 detection_rate = detection_rate, coverage = coverage)
pairs(df, pch = 19)
print(cor(df, method = "spearman"))
## W1 W2 gamma_mu detection_rate
## W1 1.00000000 0.08210808 -0.63936222 -0.65269334
## W2 0.08210808 1.00000000 -0.02877313 -0.05977616
## gamma_mu -0.63936222 -0.02877313 1.00000000 0.71618192
## detection_rate -0.65269334 -0.05977616 0.71618192 1.00000000
## coverage -0.32921130 0.13899708 0.87127556 0.47910999
## coverage
## W1 -0.3292113
## W2 0.1389971
## gamma_mu 0.8712756
## detection_rate 0.4791100
## coverage 1.0000000
# Diagnostics for the mu-intercept-only fit: fitted Pi vs detection
# rate, fitted log Mu vs coverage.
par(mfrow = c(1, 2))
plot(rowMeans(getPi(zinb_Vmu)), detection_rate,
     pch = 19, xlab = "Average estimated Pi", ylab = "detection rate")
plot(rowMeans(log1p(getMu(zinb_Vmu))), coverage,
     pch = 19, xlab = "Average estimated log Mu", ylab = "coverage")
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
# Gene-level estimates of the mu-intercept-only fit.
par(mfrow = c(2, 2))
boxplot(zinb_Vmu@beta_mu[1,], main = "Beta_Mu")
boxplot(zinb_Vmu@beta_pi[1,], main = "Beta_Pi")
plot(t(zinb_Vmu@alpha_mu), main = "Alpha_Mu",
     xlab = "Alpha_Mu 1", ylab = "Alpha_Mu 2")
plot(t(zinb_Vmu@alpha_pi), main = "Alpha_Pi",
     xlab = "Alpha_Pi 1", ylab = "Alpha_Pi 2")
# Symmetric fit: same V, but the gene-level intercept goes to the
# dropout model (pi) only.
fit_time <- system.time(
  zinb_Vpi <- zinbFit(core, ncores = 3, K = 2, V = V,
                      which_V_mu = 1L, which_V_pi = 2L)
)
print(fit_time)
## user system elapsed
## 133.406 36.809 63.523
data.frame(W1=zinb_Vpi@W[,1], W2=zinb_Vpi@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()
# With an intercept in pi only, gamma_pi is the sole cell-level offset;
# relate it and W to the QC covariates.
df <- data.frame(W1 = zinb_Vpi@W[, 1], W2 = zinb_Vpi@W[, 2],
                 gamma_pi = zinb_Vpi@gamma_pi[1, ],
                 detection_rate = detection_rate, coverage = coverage)
pairs(df, pch = 19)
print(cor(df, method = "spearman"))
## W1 W2 gamma_pi detection_rate
## W1 1.000000000 0.234104187 0.007327167 0.1082811
## W2 0.234104187 1.000000000 0.007521908 -0.5629865
## gamma_pi 0.007327167 0.007521908 1.000000000 -0.4664147
## detection_rate 0.108281146 -0.562986496 -0.466414709 1.0000000
## coverage 0.594547225 -0.421056475 -0.157205453 0.4791100
## coverage
## W1 0.5945472
## W2 -0.4210565
## gamma_pi -0.1572055
## detection_rate 0.4791100
## coverage 1.0000000
# Diagnostics for the pi-intercept-only fit: fitted Pi vs detection
# rate, fitted log Mu vs coverage.
par(mfrow = c(1, 2))
plot(rowMeans(getPi(zinb_Vpi)), detection_rate,
     pch = 19, xlab = "Average estimated Pi", ylab = "detection rate")
plot(rowMeans(log1p(getMu(zinb_Vpi))), coverage,
     pch = 19, xlab = "Average estimated log Mu", ylab = "coverage")
It is worth looking at the other estimates as well, namely, \(\beta\) and \(\alpha\).
# Gene-level estimates of the pi-intercept-only fit.
par(mfrow = c(2, 2))
boxplot(zinb_Vpi@beta_mu[1,], main = "Beta_Mu")
boxplot(zinb_Vpi@beta_pi[1,], main = "Beta_Pi")
plot(t(zinb_Vpi@alpha_mu), main = "Alpha_Mu",
     xlab = "Alpha_Mu 1", ylab = "Alpha_Mu 2")
plot(t(zinb_Vpi@alpha_pi), main = "Alpha_Pi",
     xlab = "Alpha_Pi 1", ylab = "Alpha_Pi 2")
# Export the log1p counts of the 1,000 selected genes for ZIFA (run
# separately in Python), then reload the full filtered count matrix
# (all genes that passed the count filter) for an all-genes fit.
write.csv(log1p(core), file="logcounts_th2.csv")
core <- assay(th2_core)
print(dim(core))
## [1] 9365 79
# Fit ZINB-WaVE on all filtered genes (not just the top 1,000).
fit_time <- system.time(zinb_all <- zinbFit(core, ncores = 3, K = 2))
print(fit_time)
## user system elapsed
## 671.873 71.356 296.410
data.frame(W1=zinb_all@W[,1], W2=zinb_all@W[,2]) %>% ggplot(aes(W1, W2)) + geom_point(aes(color=gata3), size=3) + scale_colour_gradient(low="blue", high="green") + theme_classic()
Interestingly, when using all the genes, the low-dimensional representation still recovers the expected structure (cells separate along Gata3 expression).
# ZIFA embeddings computed in Python; presumably the "_full" file is the
# all-genes run and the other the top-1,000 run -- TODO confirm.
zifa_res <- read.csv("zifa_th2_full.csv", header = FALSE)
ggplot(data.frame(Z1 = zifa_res[, 1], Z2 = zifa_res[, 2]), aes(Z1, Z2)) +
  geom_point(aes(color = gata3), size = 3) +
  scale_colour_gradient(low = "blue", high = "green") +
  theme_classic()
zifa_res <- read.csv("zifa_th2.csv", header = FALSE)
ggplot(data.frame(Z1 = zifa_res[, 1], Z2 = zifa_res[, 2]), aes(Z1, Z2)) +
  geom_point(aes(color = gata3), size = 3) +
  scale_colour_gradient(low = "blue", high = "green") +
  theme_classic()